import ast
import json
from collections import Counter

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import account_util as ut
# Read the tweet dump, keeping only the columns this analysis uses.
_tweet_columns = [
    'user_id', 'user_created_at', 'user_screen_name', 'user_mentions', 'created_at',
    'user_verified', 'rt_created_at', 'in_reply_to_screen_name', 'rt_user_id',
    'rt_user_screen_name', 'is_self_rt', 'user_url_cred',
]
df = pd.read_csv('../tweets_novax.csv', usecols=_tweet_columns, low_memory=False)

# Both timestamp columns are parsed with the same explicit format string.
_timestamp_format = "%a %b %d %X %z %Y"
for _column in ('user_created_at', 'created_at'):
    df[_column] = pd.to_datetime(df[_column], format=_timestamp_format)

# Print the column / dtype / non-null summary of the loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1328997 entries, 0 to 1328996 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 created_at 1328997 non-null datetime64[ns, UTC] 1 user_id 1328997 non-null int64 2 user_screen_name 1328997 non-null object 3 user_verified 1328997 non-null bool 4 user_created_at 1328997 non-null datetime64[ns, UTC] 5 in_reply_to_screen_name 151522 non-null object 6 rt_created_at 999200 non-null object 7 rt_user_id 999200 non-null float64 8 rt_user_screen_name 999200 non-null object 9 user_mentions 1328997 non-null object 10 is_self_rt 1328997 non-null bool 11 user_url_cred 851951 non-null float64 dtypes: bool(2), datetime64[ns, UTC](2), float64(2), int64(1), object(5) memory usage: 103.9+ MB
# Build one row per account, indexed by screen name.
#
# Only the two columns that are actually needed are aggregated. The previous
# version summed *every* column of df per account (concatenating strings and
# summing ids/timestamps, which newer pandas versions may reject) just to
# derive a single boolean, and then evaluated a column selection whose result
# was discarded.
_by_account = df.groupby('user_screen_name')

dfAccount = pd.DataFrame()
# An account is flagged as verified if any of its rows is flagged
# (equivalent to "sum of the boolean flags > 0").
dfAccount['user_verified'] = _by_account['user_verified'].any()
# Account creation timestamp, taken from the account's first row in df.
dfAccount['created_at'] = _by_account['user_created_at'].first()
# Pie chart: how many accounts are verified vs. not verified (absolute counts).
_verified_total = dfAccount['user_verified'].sum()
names = ['Yes', 'No']
values = [_verified_total, len(dfAccount) - _verified_total]

fig = px.pie(values=values, names=names)
fig.update_layout(title="Is the user verified?")
fig.show()

# Bare expression: the number of accounts, shown only in an interactive session.
len(dfAccount)
8118
# Weekly number of accounts created, over the whole history of the dataset.
#
# size() counts the rows that fall in each weekly bin directly. The previous
# `.count().iloc[:, 1]` + rename picked a column by position, which depends on
# the column order and on whether the resampler keeps the `on` column in its
# output (this differs between pandas versions).
dfCountUserCreation = (
    dfAccount.resample('W', on='created_at').size().to_frame('count')
)

# The weekly counts are summed again into (at most) 100 histogram bins.
fig = px.histogram(dfCountUserCreation, x=dfCountUserCreation.index, y='count',
                   title='User creation distribution', nbins=100)
fig.update_yaxes(title='count')
fig.update_xaxes(title='date')
fig.show()
Dal grafico sovrastante si possono notare due picchi importanti, nel 2012 e nel 2020, e una crescita rilevante del numero di profili creati nel 2009. La prima crescita, che risale appunto al 2009, è dovuta all'introduzione dell'italiano (insieme a francese, tedesco e spagnolo) come lingua attiva.
Dopodiché, nel 2012, Twitter ha per la prima volta battuto una notizia di rilevanza istituzionale con largo anticipo rispetto ai media tradizionali: la morte del presidente emerito Oscar Luigi Scalfaro. Questo ha portato a un aumento, soprattutto in Italia, delle iscrizioni a Twitter.
Infine, verso fine marzo 2020, soprattutto in Italia, vi è stato un ferreo lockdown a causa del Covid. Questo lockdown, che ha costretto tutti a casa, ha portato a un aumento di iscrizioni ai social, quindi anche a Twitter, per riempire i momenti di noia.
# Weekly number of accounts created since COVID-19 started.
# 2019-11-17 is used as the cut-off: the date of the first ascertained case of
# COVID-19 infection (the earlier comment said 2020-11-17, which did not match
# the value actually used).
date = "2019-11-17"
df1 = dfAccount[dfAccount['created_at'] > date]

# size() counts the rows in each weekly bin directly, instead of selecting a
# column of `.count()` by position (brittle across column orders and pandas
# versions).
dfCountUserCreation = df1.resample('W', on='created_at').size().to_frame('count')

fig = px.histogram(dfCountUserCreation, x=dfCountUserCreation.index, y='count',
                   title='User cration distribution (since covid-19 started)', nbins=100)
fig.update_yaxes(title='count')
fig.update_xaxes(title='date')
fig.show()
Concentrando la distribuzione a partire dal primo caso di Covid-19 registrato, si possono notare tre picchi interessanti: a marzo 2020 (già notato nel grafico precedente), a novembre 2020 e a dicembre 2020. I nuovi picchi, a differenza di quello di marzo, riguardano ciascuno una specifica settimana.
I picchi settimanali di novembre e dicembre cadono proprio in due momenti importanti in Italia: il primo, quello di novembre, capita durante la settimana del 3 novembre, quando viene istituito un coprifuoco dalle 22:00 alle 05:00 per tutta l'Italia; il secondo, invece, poco prima delle feste natalizie, quando il governo ha istituito la zona rossa nazionale.
In questi periodi, come a marzo, le persone, ritrovandosi senza nulla da fare, hanno visto nei social come Twitter un passatempo e uno sfogo.
# Load the manually controlled user lists from the dataset-preparation step.
with open('../1_Dataset_preparation/listControlledUsers.json', 'r') as file_object:
    data = json.load(file_object)

# The JSON object holds one list per category key.
listNovax, listLinkLow, listProvax = (
    data[key] for key in ('Novax', 'link_low', 'Provax')
)
# Prepare the per-account table over all rows of df (tweets, retweets, replies).
# NOTE(review): the account_util helpers are not visible here; judging by the
# displayed result they yield a frame indexed by user_screen_name with a
# `tweet_count` column and a `user_type` label - confirm against account_util.
_screen_names = pd.DataFrame(df['user_screen_name'])
dfTweetByName = ut.add_user_type(
    ut.get_df_raggruped(_screen_names, 'tweet_count', 'user_screen_name'),
    listNovax,
    listProvax,
    listLinkLow,
)
dfTweetByName
| tweet_count | user_type | |
|---|---|---|
| user_screen_name | ||
| Piero42395724 | 4803 | Novax |
| IacobellisT | 4333 | Novax |
| Pietro_Otto | 4123 | Novax |
| TommyBrain | 4115 | Novax |
| Z3r0Rules | 4081 | Novax |
| ... | ... | ... |
| antoniopaddeu | 10 | Not defined |
| MinaAlessio | 10 | Not defined |
| AleMacchiavelli | 10 | Not defined |
| sunot_a | 10 | Not defined |
| Sentenza2020 | 10 | Not defined |
8118 rows × 2 columns
# Plot the most active accounts (all activity) for several cut-offs.
for top_n in (10, 20, 40, 50):
    ut.print_histogram_users(
        dfTweetByName,
        top_n,
        'tweet_count',
        'Most %d active users' % top_n,
        'Count of tweets',
    )
# Accounts ranked by number of original posts, i.e. rows that are neither
# replies (no in_reply_to_screen_name) nor retweets (no rt_created_at).
_is_original_post = df['in_reply_to_screen_name'].isna() & df['rt_created_at'].isna()
df_tweets = df.loc[_is_original_post, ['user_screen_name']]
df_tweets = ut.get_df_raggruped(df_tweets, 'tweet_count', 'user_screen_name')

# Keep the labelled result in df_tweets. The previous version assigned it to
# dfTweetByName, which overwrote the all-activity table built earlier.
df_tweets = ut.add_user_type(df_tweets, listNovax, listProvax, listLinkLow)
df_tweets
| tweet_count | user_type | |
|---|---|---|
| user_screen_name | ||
| IacobellisT | 3814 | Novax |
| TommyBrain | 3463 | Novax |
| bisagnino | 2064 | Novax |
| Pietro_Otto | 920 | Novax |
| gabrillasarti2 | 877 | Not defined |
| ... | ... | ... |
| isaisaisais | 1 | Not defined |
| Bierbrauer53 | 1 | Not defined |
| Biene26294026 | 1 | Not defined |
| BiancoRinaldo | 1 | Not defined |
| Mr_Big_72 | 1 | Not defined |
5816 rows × 2 columns
# Plot the 20 accounts with the most original posts.
_top_posters = 20
ut.print_histogram_users(
    df_tweets,
    _top_posters,
    'tweet_count',
    'Most %d users that create posts' % _top_posters,
    'Count of tweets',
)
# Per retweeted account: total retweets, self-retweets, and retweets by others.
#
# The retweet rows are filtered and grouped once, and only the needed column
# is aggregated each time. The previous version called `.sum()` on the whole
# grouped frame (string concatenation, and timestamp sums that newer pandas
# versions may reject) just to read `is_self_rt`.
_retweets_by_author = df[df['rt_created_at'].notna()].groupby('rt_user_screen_name')

df_retweet = pd.DataFrame()
df_retweet['all_rt'] = _retweets_by_author['user_id'].count()
df_retweet['self_rt'] = _retweets_by_author['is_self_rt'].sum()
# Retweets made by someone other than the original author.
df_retweet['real_rt'] = df_retweet['all_rt'] - df_retweet['self_rt']
df_retweet.sort_values('real_rt', ascending=False, inplace=True)

df_retweet = ut.add_user_type(df_retweet, listNovax, listProvax, listLinkLow)
df_retweet
| all_rt | self_rt | real_rt | user_type | |
|---|---|---|---|---|
| rt_user_screen_name | ||||
| MinervaMcGrani1 | 25683 | 0 | 25683 | Novax |
| BarbaraRaval | 19392 | 35 | 19357 | Novax |
| valy_s | 18594 | 1 | 18593 | Novax |
| ImolaOggi | 18128 | 0 | 18128 | Not defined |
| noitre32 | 16349 | 0 | 16349 | Novax |
| ... | ... | ... | ... | ... |
| Rosiserra727 | 1 | 1 | 0 | Not defined |
| EmidhiusWallace | 1 | 1 | 0 | Not defined |
| disicaterina | 1 | 1 | 0 | Not defined |
| Andrea_Olivieri | 1 | 1 | 0 | Not defined |
| megliobarbari | 3 | 3 | 0 | Not defined |
16875 rows × 4 columns
# Plot the most retweeted accounts (self-retweets excluded) for several cut-offs.
for top_n in (10, 20, 40, 50):
    ut.print_histogram_users(
        df_retweet,
        top_n,
        'real_rt',
        'Most %d retweeted users' % top_n,
        'Count of retweets',
    )
def _add_retweet_bars(figure, accounts, column):
    """Add the `real_rt` and `self_rt` horizontal bars of *accounts* to subplot *column*."""
    # NOTE(review): the trace labelled 'All retweet' plots `real_rt`
    # (all_rt - self_rt), not `all_rt` - confirm the label is intended.
    for field, label, colour in (
        ('real_rt', 'All retweet', '#636EFA'),
        ('self_rt', 'Self retweet', '#EF553B'),
    ):
        figure.add_trace(
            go.Bar(y=accounts.index, x=accounts[field], orientation='h',
                   name=label, marker_color=colour),
            row=1, col=column,
        )


# One figure per cut-off. Up to 20 accounts fit in a single panel; larger
# cut-offs are split across two side-by-side panels.
for top_n in (10, 20, 40, 50):
    top_accounts = df_retweet.head(top_n)
    if top_n <= 20:
        panels = [top_accounts]
    else:
        n = top_n // 2
        # Left panel: the first (top_n - n) accounts; right panel: the last n.
        panels = [df_retweet.head(top_n - n), top_accounts.tail(n)]

    fig = make_subplots(rows=1, cols=len(panels))
    for column, accounts in enumerate(panels, start=1):
        _add_retweet_bars(fig, accounts, column)

    fig.update_layout(title="The most %d frequent retweet account" % top_n)
    fig.update_xaxes(title="Count of retweets")
    fig.update_yaxes(title="Username")
    fig.show()
# Table of replied-to accounts, then a chart of the top 20.
# NOTE(review): presumably get_df_raggruped counts rows per
# in_reply_to_screen_name - helper not visible here, confirm in account_util.
_reply_targets = pd.DataFrame(df['in_reply_to_screen_name'])
df_replied = ut.get_df_raggruped(_reply_targets, 'reply_count', 'in_reply_to_screen_name')
df_replied = ut.add_user_type(df_replied, listNovax, listProvax, listLinkLow)

ut.print_histogram_users(
    df_replied,
    20,
    'reply_count',
    'Most 20 replied users',
    'Reply count',
)
# Pie chart of how the rows of df split into tweets, retweets and replies.
retweet = df_retweet['all_rt'].sum()
reply = df_replied['reply_count'].sum()
# Whatever is neither a retweet nor a reply is counted as a plain tweet.
tweet = len(df) - retweet - reply

names = ['Tweets', 'Retweet', 'Reply']
values = [tweet, retweet, reply]

fig = px.pie(values=values, names=names)
fig.update_traces(textinfo='value+percent')
fig.update_layout(title="How are the tweets distribuited")
fig.show()
# Count how many times each account is mentioned across all rows of df.
#
# Each `user_mentions` cell is the text form of a list of dicts, each with a
# 'screen_name' key. It is parsed with ast.literal_eval rather than eval:
# literal_eval accepts only Python literals, so text read from the CSV can
# never be executed as code.
listMention = [
    mention['screen_name']
    for raw_mentions in df['user_mentions']
    for mention in ast.literal_eval(raw_mentions)
]

# Counter does the tallying; most_common() yields (name, count) pairs sorted
# by descending count. This replaces the dummy-column + groupby + sort.
dfMentions = pd.DataFrame(
    Counter(listMention).most_common(), columns=['name', 'count']
).set_index('name')

dfMentions = ut.add_user_type(dfMentions, listNovax, listProvax, listLinkLow)
dfMentions
| count | user_type | |
|---|---|---|
| name | ||
| MinervaMcGrani1 | 31274 | Novax |
| valy_s | 25722 | Novax |
| borghi_claudio | 22689 | Novax |
| BarbaraRaval | 21617 | Novax |
| ImolaOggi | 19932 | Not defined |
| ... | ... | ... |
| Attacco_d_pane | 1 | Not defined |
| VaccinesToday | 1 | Not defined |
| VacanzeMarrakeh | 1 | Not defined |
| AttilaAzureRive | 1 | Not defined |
| SulleyTruman | 1 | Not defined |
34149 rows × 2 columns
# Plot the n most mentioned accounts.
n = 20
ut.print_histogram_users(
    dfMentions,
    n,
    'count',
    'Most %d mentioned users' % n,
    'Number of mention',
)
# Weekly number of rows in df (tweets, retweets and replies together).
#
# size() counts the rows in each weekly bin directly. The previous
# `.count().iloc[:, 1]` + rename of "user_id" relied on user_id being the
# second column of the resampled frame, which depends on the column order and
# on whether the resampler keeps the `on` column (this differs between pandas
# versions).
dfCountTweetCreation = df.resample('W', on='created_at').size().to_frame('count')

# The weekly counts are summed again into (at most) 100 histogram bins.
fig = px.histogram(dfCountTweetCreation, x=dfCountTweetCreation.index, y='count',
                   title='Tweets creation distribution', nbins=100)
fig.show()
# Histogram (in percent) of the `user_url_cred` values strictly below 1.
# NOTE(review): the meaning and scale of user_url_cred are not visible here.
df1 = df[df['user_url_cred'].notna()]
_below_one = df1['user_url_cred'] < 1
user_credibility = df1.loc[_below_one, 'user_url_cred']

fig = px.histogram(
    x=user_credibility,
    histnorm='percent',
    title="User credibility",
)
fig.show()